################################################################################
## Group Identities and Parliamentary Debates: Replication package
## Fiva, Nedregård and Øien (2025)

## Description:

# This file builds the document-term matrix (DTM) used in the estimation.
# - Input: the text column of speeches_session_lemma.csv, which contains
#   lemmatized speeches at the speaker x session level. The speeches are
#   cleaned and lemmatized by gen_data.R.
# - Output: the DTM used to calculate propensities and posteriors.

################################################################################

# Clear workspace and memory ----

# Remove all objects first and garbage-collect afterwards, so that gc() can
# actually release the memory held by the objects that were just removed.
# NOTE(review): rm(list = ls()) clears every object in the environment this
# script is evaluated in -- confirm master.R does not rely on objects
# surviving across scripts.
rm(list = ls())
gc()

# Packages 

library(data.table)
library(textmineR)
library(stringr)


## The wd is set by master.R, so the path below is relative to that directory.

data.dir <- "../data/2_processed_data"

# Read data----
## Text data: read the lemmatized speeches and keep only the document
## identifier (pid_session) and the speech text.
DT <- fread(file.path(data.dir, "speeches_session_lemma.csv"),
            encoding = "UTF-8")

DT <- DT[, .(pid_session, text)]


# Create a DTM ----

# Build a document-term matrix from the speech texts, using pid_session as the
# document names.
# NOTE(review): all other CreateDtm preprocessing arguments are left at their
# package defaults -- confirm these suit the already-lemmatized text.
dtm <- CreateDtm(DT$text,
                 doc_names = DT$pid_session,
                 ngram_window = c(1, 1), # unigrams only
                 stopword_vec = character(0) # stopwords are already removed by gen_data.R
)

# Save the DTM in the processed-data directory.
saveRDS(dtm, file.path(data.dir, "document_term_matrix.rds"))

